篇首语:本文由编程笔记#小编为大家整理,主要介绍了linux安装prometheus+grafana+alertmanager相关的知识,希望对你有一定的参考价值。
软件版本 |
├── grafana-enterprise-8.0.5-1.x86_64.rpm └── prometheus-2.32.1.linux-amd64.tar.gz └── node_exporter-1.3.1.linux-amd64.tar.gz └── alertmanager-0.23.0.linux-amd64.tar.gz |
系统内核版本 | 系统版本 |
2.6.32-642.el6.x86_64 | CentOS release 6.8 (Final) |
3.10.0-1160.el7.x86_64 | CentOS Linux release 7.9.2009 (Core) |
将软件上传至服务器
[root@master ~]# tar zxvf prometheus-2.32.1.linux-amd64.tar.gz -C /home
[root@master ~]# mv /home/prometheus-2.32.1.linux-amd64/ /home/prometheus-2.32.1
[root@master ~]# cd /home/prometheus-2.32.1/ && nohup ./prometheus &
[root@master ~]# netstat -ntpl
Active Internet connections (only servers)
Proto Recv-Q Send-Q Local Address      Foreign Address    State     PID/Program name
tcp        0      0 0.0.0.0:22         0.0.0.0:*          LISTEN    1360/sshd
tcp        0      0 :::9090            :::*               LISTEN    1528/./prometheus    #服务启动成功
tcp        0      0 :::22              :::*               LISTEN    1360/sshd
添加为服务并设置开机自启动
[root@master ~]# cat > /usr/lib/systemd/system/prometheus.service <<EOF
[Unit]
Description=https://prometheus.io

[Service]
Restart=on-failure
ExecStart=/home/prometheus-2.32.1/prometheus --config.file=/home/prometheus-2.32.1/prometheus.yml

[Install]
WantedBy=multi-user.target
EOF
将 node_exporter-1.3.1.linux-amd64.tar.gz上传至客户端
[root@master ~]# tar zxvf node_exporter-1.3.1.linux-amd64.tar.gz -C /home
[root@master ~]# mv /home/node_exporter-1.3.1.linux-amd64/ /home/node_exporter-1.3.1
[root@master ~]# cd /home/node_exporter-1.3.1/ && nohup ./node_exporter &
[root@master ~]# netstat -ntpl
Active Internet connections (only servers)
Proto Recv-Q Send-Q Local Address      Foreign Address    State     PID/Program name
tcp        0      0 0.0.0.0:22         0.0.0.0:*          LISTEN    961/sshd
tcp        0      0 127.0.0.1:25       0.0.0.0:*          LISTEN    1150/master
tcp6       0      0 :::9100            :::*               LISTEN    2153/./node_exporte
tcp6       0      0 :::22              :::*               LISTEN    961/sshd
tcp6       0      0 ::1:25             :::*               LISTEN    1150/master
tcp6       0      0 :::9090            :::*               LISTEN    2130/./prometheus
添加为服务并设置开机自启动
[root@master ~]# cat > /etc/systemd/system/node_exporter.service <<'EOF'
[Unit]
Description=node_exporter
Documentation=https://prometheus.io/docs/introduction/overview
After=network-online.target remote-fs.target nss-lookup.target
Wants=network-online.target

[Service]
Type=simple
PIDFile=/var/run/node_exporter.pid
ExecStart=/home/node_exporter-1.3.1/node_exporter
ExecReload=/bin/kill -s HUP $MAINPID
ExecStop=/bin/kill -s TERM $MAINPID

[Install]
WantedBy=multi-user.target
EOF
(注意:此处定界符必须写成带引号的 'EOF',否则 shell 会把 $MAINPID 展开为空字符串。)
[root@master ~]# vim /home/prometheus-2.32.1/prometheus.yml
在 scrape_configs 下添加以下信息,要注意格式(YAML 缩进):
  - job_name: "linux"
    static_configs:
      - targets: ["192.168.1.60:9100","192.168.1.61:9100","192.168.1.62:9100"]
设置开机启动
[root@master ~]# systemctl daemon-reload
[root@master ~]# systemctl enable node_exporter
[root@master ~]# systemctl start node_exporter
[root@master ~]# systemctl enable prometheus
[root@master ~]# systemctl start prometheus
查看状态
[root@master ~]# yum -y install urw-fonts
[root@master ~]# rpm -ivh grafana-enterprise-8.0.5-1.x86_64.rpm
warning: grafana-enterprise-8.0.5-1.x86_64.rpm: Header V4 RSA/SHA256 Signature, key ID 24098cb6: NOKEY Preparing... ################################# [100%] Updating / installing... 1:grafana-enterprise-8.0.5-1 ################################# [100%] ### NOT starting on installation, please execute the following statements to configure grafana to start automatically using systemd sudo /bin/systemctl daemon-reload sudo /bin/systemctl enable grafana-server.service ### You can start grafana-server by executing sudo /bin/systemctl start grafana-server.service POSTTRANS: Running script |
[root@master ~]# systemctl enable grafana-server
[root@master ~]# systemctl start grafana-server
[root@master ~]# netstat -ntpl
[root@master ~]# grafana-cli plugins install grafana-piechart-panel
2.重启grafana-server
[root@master ~]# systemctl restart grafana-server
打开grafana网页
prometheus+grafana安装完成
[root@master~ ]# tar zxvf alertmanager-0.23.0.linux-amd64.tar.gz -C /home/
[root@master~ ]# mv /home/alertmanager-0.23.0.linux-amd64 /home/alertmanager-0.23.0
[root@master~ ]# cd /home/alertmanager-0.23.0
[root@master alertmanager-0.23.0]# cp alertmanager.yml alertmanager.yml-back
[root@master alertmanager-0.23.0]# vim alertmanager.yml
global:
  resolve_timeout: 5m
  smtp_smarthost: smtp.163.com:25
  smtp_from: 18600000000@163.com
  smtp_auth_username: 18600000000@163.com
  smtp_auth_password: XXXXXXXXXX
  smtp_require_tls: false
route:
  group_by: [alertname]
  group_wait: 10s
  group_interval: 1m
  repeat_interval: 3m
  receiver: mail
receivers:
  - name: mail
    email_configs:
      - to: 58888888@qq.com
        headers:
          Subject: "[WARN] 报警邮件"
        send_resolved: true
inhibit_rules:
  - source_match:
      severity: critical
    target_match:
      severity: warning
    equal: [alertname, dev, instance]
如图:
[root@master ~ ]# mkdir /home/alertmanager-0.23.0/rules
[root@master ~ ]# vim /home/alertmanager-0.23.0/rules/node.yml
groups:
- name: 主机状态-监控告警
  rules:
  - alert: 主机状态
    expr: up == 0
    for: 5m
    labels:
      status: 非常严重
    annotations:
      summary: "{{ $labels.instance }}:服务器宕机"
      description: "{{ $labels.instance }}:服务器延时超过5分钟"
  - alert: CPU使用情况
    expr: 100-(avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by(instance)* 100) > 80
    for: 1m
    labels:
      status: 一般告警
    annotations:
      summary: "{{ $labels.instance }} CPU使用率过高!"
      description: "{{ $labels.instance }} CPU使用大于80%(目前使用:{{ $value }}%)"
  - alert: 内存使用
    expr: (1 - (node_memory_MemAvailable_bytes / (node_memory_MemTotal_bytes))) * 100 > 80
    for: 1m
    labels:
      status: 严重告警
    annotations:
      summary: "{{ $labels.instance }} 内存使用率过高!"
      description: "{{ $labels.instance }} 内存使用大于80%(目前使用:{{ $value }}%)"
  - alert: IO性能
    expr: (avg(irate(node_disk_io_time_seconds_total[1m])) by(instance)* 100) > 80
    for: 1m
    labels:
      status: 严重告警
    annotations:
      summary: "{{ $labels.instance }} 流入磁盘IO使用率过高!"
      description: "{{ $labels.instance }} 流入磁盘IO大于80%(目前使用:{{ $value }})"
  - alert: 网络
    expr: ((sum(rate (node_network_receive_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr*|lo*'}[5m])) by (instance)) / 100) > 102400
    for: 1m
    labels:
      status: 严重告警
    annotations:
      summary: "{{ $labels.instance }} 流入网络带宽过高!"
      description: "{{ $labels.instance }} 流入网络带宽持续2分钟高于100M. RX带宽使用率{{ $value }}"
  - alert: 网络
    expr: ((sum(rate (node_network_transmit_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr*|lo*'}[5m])) by (instance)) / 100) > 102400
    for: 1m
    labels:
      status: 严重告警
    annotations:
      summary: "{{ $labels.instance }} 流出网络带宽过高!"
      description: "{{ $labels.instance }} 流出网络带宽持续2分钟高于100M. TX带宽使用率{{ $value }}"
  - alert: TCP会话
    expr: node_netstat_Tcp_CurrEstab > 1000
    for: 1m
    labels:
      status: 严重告警
    annotations:
      summary: "{{ $labels.instance }} TCP_ESTABLISHED过高!"
      description: "{{ $labels.instance }} TCP_ESTABLISHED大于1000(目前使用:{{ $value }}%)"
  - alert: 磁盘容量
    expr: 100-(node_filesystem_free_bytes{fstype=~"ext4|xfs"}/node_filesystem_size_bytes {fstype=~"ext4|xfs"}*100) > 80
    for: 1m
    labels:
      status: 严重告警
    annotations:
      summary: "{{ $labels.instance }} 磁盘分区使用率过高!"
      description: "{{ $labels.instance }} 磁盘分区使用大于80%(目前使用:{{ $value }}%)"
修改prometheus.yml配置文件,配置报警规则,打开alerting
和 rule_files 文件指定(增加红色字体部分)
[root@master ~]# vim /home/prometheus-2.32.1/prometheus.yml
# my global config
global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets: ["192.168.1.60:9093"]
          # - alertmanager:9093

# Load rules once and periodically evaluate them according to the global evaluation_interval.
rule_files:
  - "/home/alertmanager-0.23.0/rules/*.yml"
  # - "first_rules.yml"
  # - "second_rules.yml"

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: "prometheus"
    #抓取间隔时间
    scrape_interval: 5s
    # metrics_path defaults to /metrics
    # scheme defaults to http.
    static_configs:
      - targets: ["localhost:9090","192.168.2.231:9100"]

  - job_name: "linux-host"
    scrape_interval: 5s
    # metrics_path defaults to /metrics
    # scheme defaults to http.
    static_configs:
      - targets: ["192.168.2.230:9100"]
重启服务
[root@master ~]# cat > /home/alertmanager-0.23.0/start_alertmanager.sh <<EOF
#!/bin/bash
nohup /home/alertmanager-0.23.0/alertmanager --config.file="/home/alertmanager-0.23.0/alertmanager.yml" > /home/alertmanager-0.23.0/alertmanager.log 2>&1 &
EOF
[root@master ~]# sh /home/alertmanager-0.23.0/start_alertmanager.sh
[root@master ~]# netstat -ntpl
重启 prometheus 服务
[root@master ~]# kill -9 `ps -ef | grep prometheus | grep -v grep | awk '{print $2}'`
[root@master ~ ]# systemctl start prometheus
查看监控项配置信息
[root@node1 ~]# kill -9 `ps -ef | grep node_exporter | grep -v grep | awk '{print $2}'`
进入邮箱查看告警邮件:
开启node1主机的 node_exporter 服务
收到恢复邮件
[root@master ~]# cat /home/alertmanager-0.23.0/template/wechat.tmpl
(邮件告警模板)
{{ define "wechat.default.message" }}
{{- if gt (len .Alerts.Firing) 0 -}}
{{- range $index, $alert := .Alerts -}}
{{- if eq $index 0 -}}
**********告警通知**********
告警类型: {{ $alert.Labels.alertname }}
告警级别: {{ $alert.Labels.severity }}
{{- end }}
=====================
告警主题: {{ $alert.Annotations.summary }}
告警详情: {{ $alert.Annotations.description }}
故障时间: {{ $alert.StartsAt.Local.Format "2006-01-02 15:04:05" }}
{{ if gt (len $alert.Labels.instance) 0 -}}故障实例: {{ $alert.Labels.instance }}{{- end -}}
{{- end }}
{{- end }}

{{- if gt (len .Alerts.Resolved) 0 -}}
{{- range $index, $alert := .Alerts -}}
{{- if eq $index 0 -}}
**********恢复通知**********
告警类型: {{ $alert.Labels.alertname }}
告警级别: {{ $alert.Labels.severity }}
{{- end }}
=====================
告警主题: {{ $alert.Annotations.summary }}
告警详情: {{ $alert.Annotations.description }}
故障时间: {{ $alert.StartsAt.Local.Format "2006-01-02 15:04:05" }}
恢复时间: {{ $alert.EndsAt.Local.Format "2006-01-02 15:04:05" }}
{{ if gt (len $alert.Labels.instance) 0 -}}故障实例: {{ $alert.Labels.instance }}{{- end -}}
{{- end }}
{{- end }}
{{- end }}
[root@master ~]# cat /home/alertmanager-0.23.0/alertmanager.yml
global:
  resolve_timeout: 5m
  smtp_smarthost: smtp.163.com:25
  smtp_from: 1866@163.com
  smtp_auth_username: 1866@163.com
  smtp_auth_password: RKONLSDFSDMFZM
  smtp_require_tls: false
templates:
  - /home/alertmanager-0.23.0/template/wechat.tmpl
route:
  group_by: [alertname]
  group_wait: 10s
  group_interval: 20s
  repeat_interval: 3m
  receiver: mail
receivers:
  - name: mail
    email_configs:
      - to: 534234548@qq.com
        headers:
          Subject: "[WARN] 报警邮件"
        send_resolved: true
        html: '{{ template "wechat.default.message" . }}'
inhibit_rules:
  - source_match:
      severity: critical
    target_match:
      severity: warning
    equal: [alertname, dev, instance]
查看邮件
[root@master ~]# cat /home/alertmanager-0.23.0/template/wechat.tmpl (微信告警模板)
{{ define "wechat.default.message" }}
{{- if gt (len .Alerts.Firing) 0 -}}
{{- range $index, $alert := .Alerts -}}
{{- if eq $index 0 -}}
**********告警通知**********
告警类型: {{ $alert.Labels.alertname }}
告警级别: {{ $alert.Labels.severity }}
{{- end }}
=====================
告警主题: {{ $alert.Annotations.summary }}
告警详情: {{ $alert.Annotations.description }}
故障时间: {{ $alert.StartsAt.Local.Format "2006-01-02 15:04:05" }}
{{ if gt (len $alert.Labels.instance) 0 -}}故障实例: {{ $alert.Labels.instance }}{{- end -}}
{{- end }}
{{- end }}

{{- if gt (len .Alerts.Resolved) 0 -}}
{{- range $index, $alert := .Alerts -}}
{{- if eq $index 0 -}}
**********恢复通知**********
告警类型: {{ $alert.Labels.alertname }}
告警级别: {{ $alert.Labels.severity }}
{{- end }}
=====================
告警主题: {{ $alert.Annotations.summary }}
告警详情: {{ $alert.Annotations.description }}
故障时间: {{ $alert.StartsAt.Local.Format "2006-01-02 15:04:05" }}
恢复时间: {{ $alert.EndsAt.Local.Format "2006-01-02 15:04:05" }}
{{ if gt (len $alert.Labels.instance) 0 -}}故障实例: {{ $alert.Labels.instance }}{{- end -}}
{{- end }}
{{- end }}
{{- end }}
[root@master ~]# cat /home/alertmanager-0.23.0/alertmanager.yml
global:
  resolve_timeout: 5m
  wechat_api_url: https://qyapi.weixin.qq.com/cgi-bin/
templates:
  - /home/alertmanager-0.23.0/template/wechat.tmpl
route:
  group_by: [alertname]
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 3m
  receiver: wechat
receivers:
  - name: wechat
    wechat_configs:
      - corp_id: weterterterterhc6
        to_party: 1
        agent_id: 1000002
        api_secret: Ha_wefsdfertgretgerguRCVPnzvK1fY
        send_resolved: true
inhibit_rules:
  - equal: [alertname, cluster, service]
    source_match:
      severity: high
    target_match:
      severity: warning